23.2.1 (#1, #2)

  1. One downside of the linear model is that it is sensitive to unusual values because the distance incorporates a squared term. Fit a linear model to the simulated data below, and visualise the results. Rerun a few times to generate different simulated datasets. What do you notice about the model?

# Seed the RNG so the simulated data (and everything downstream) is reproducible.
set.seed(0)
suppressMessages(library(dplyr))
# Simulated data: 3 replicates at each x in 1..10. The noise is t-distributed
# with df = 2, which is heavy-tailed, so extreme outliers are common by design.
sim1a <- tibble(
  x = rep(1:10, each = 3),
  y = x * 1.5 + 6 + rt(length(x), df = 2)
)


# pander renders model summaries as readable tables in the knitted output.
suppressMessages(library(pander))

# Ordinary least-squares fit of y on x for the simulated data.
sim1a.model <- lm(y ~ x, data = sim1a)
pander(sim1a.model)
Fitting linear model: y ~ x
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.727 2.228 3.468 0.001712
x 1.12 0.3591 3.118 0.004188
suppressMessages(library(ggplot2))
suppressMessages(library(plotly))

# Keep a plain-data.frame copy for plotting, then tag the original dataset
# with id = 'a' so it can be stacked with further simulations below.
sim1a.df <- as.data.frame(sim1a)
sim1a <- cbind(sim1a, id = 'a')

# Scatter plot with the lm fit drawn as a plain line (no confidence ribbon).
plot1.gg <- ggplot(aes(x = x, y = y), data = sim1a.df) + geom_point() + geom_line(stat ="smooth", method = "lm")
ggplotly(plot1.gg)
# Generate one fresh simulated dataset with the same structure as sim1a:
# 3 replicates at each x in 1..10, plus heavy-tailed t noise (df = 2).
# Returns a plain data.frame with columns x and y.
simfunction <- function() {
  xs <- rep(1:10, each = 3)
  data.frame(x = xs, y = xs * 1.5 + 6 + rt(length(xs), df = 2))
}

# Build one long data frame of 26 simulated datasets (ids 'a'..'z').
# sim1a (id = 'a') was generated above; the remaining 25 are fresh draws.
# Using lapply + a single rbind instead of growing sims.df inside the loop
# avoids the O(n^2) copy-on-append anti-pattern; the RNG is still consumed
# in the same order (one simfunction() call per letter, in sequence).
sims.df <- do.call(
  rbind,
  c(list(sim1a),
    lapply(letters[-1], function(letter) cbind(simfunction(), id = letter)))
)

# Draw every simulation in one plot: points plus one lm fit per id,
# coloured by simulation id.
sims.gg <- ggplot(data = sims.df, aes(x = x, y = y, group = id, color = id)) +
  geom_point() +
  geom_line(stat = "smooth", method = "lm")
ggplotly(sims.gg)

# Same plot as small multiples, one panel per simulation.
sims.gg + facet_wrap(~id, ncol = 4, scales = 'free_y')

Expected y-intercept: 6. Expected slope: 1.5.

# Fit y ~ x separately within each simulation id; dlply returns a named
# list of lm objects (one element per letter).
models <- plyr::dlply(
  sims.df, "id",
  function(sim_data) lm(y ~ x, data = sim_data)
)
pander(models)

Some of the models get pretty close, but when there are one or two values on the high or low end that are extremely off, the slope of the linear model is thrown completely off. This is also partially because there are only 30 samples, so a few extreme values dominate the squared distances that are minimized when fitting the linear model.

  1. One way to make linear models more robust is to use a different distance measure. For example, instead of root-mean-squared distance, you could use mean-absolute distance:

# Predict y for each row of `data` from a two-element parameter vector:
# a[1] is the slope, a[2] is the intercept.
make_prediction <- function(a, data) {
  with(data, x * a[1] + a[2])
}

# Mean-absolute distance between observed y and the predictions of `mod`
# (slope, intercept). Less sensitive to outliers than root-mean-squared
# distance because large errors are not squared.
measure_distance <- function(mod, data) {
  abs_errors <- abs(data$y - make_prediction(mod, data))
  mean(abs_errors)
}

Use `optim()` to fit this model to the simulated data above and compare it to the linear model.

pander(optim(c(0,0), measure_distance, data = sim1a))

optim() found an a[1] (slope) of 1.51 and an a[2] (intercept) of 6.027 on sim1a, whereas lm(y ~ x) found a slope of 1.12 and an intercept of 7.727 on the same input data.

# Fit the mean-absolute-distance model to every simulated dataset.
abs.model <- plyr::dlply(sims.df, "id",
                         function(df) optim(c(0, 0), measure_distance, data = df))

# Stack each optim() parameter pair (slope, intercept) into a 26 x 2 matrix.
# BUG FIX: the original seeded this with `abs.model[1]$par`, which is NULL
# because `[1]` keeps the list wrapper (so `$par` matches nothing); the loop
# only worked by accident via rbind(NULL, ...). Building the matrix in one
# shot also avoids growing an object inside a loop.
pars <- do.call(rbind, lapply(abs.model, function(m) m$par))

# Stack the lm() coefficients the same way; coef() returns (Intercept) first.
pars.lm <- as.data.frame(do.call(rbind, lapply(models, coef)))

colnames(pars) <- c("optim.m", "optim.b")
colnames(pars.lm) <- c("lm.b", "lm.m")  # intercept first, then slope

# One row per simulation id; the `letters` column is used as the x axis below.
pars.all <- cbind(pars, pars.lm, letters)

# Scatter the fitted slope (m) and intercept (b) from both methods,
# one column of points per simulation id.
ggplot(data = pars.all, aes(x = as.factor(letters))) +
  geom_point(aes(y = optim.m, color = 'optim.m')) +
  geom_point(aes(y = lm.m, color = 'lm.m')) +
  geom_point(aes(y = optim.b, color = 'optim.b')) +
  geom_point(aes(y = lm.b, color = 'lm.b')) +
  theme(legend.position = 'top') +
  ylab("Predicted Value") +
  xlab("Simulation Number")

# Compare the spread of the fitted parameters across all 26 simulations:
# one box per method/parameter. The true values are slope = 1.5, intercept = 6.
ggplot(data = pars.all) +
  geom_boxplot(aes(x = 'optim', y = optim.m, color = 'optim.m')) +
  geom_boxplot(aes(x = 'lm', y = lm.m, color = 'lm.m')) +
  geom_boxplot(aes(x = 'optim', y = optim.b, color = 'optim.b')) +
  geom_boxplot(aes(x = 'lm', y = lm.b, color = 'lm.b')) +
  theme(legend.position = 'top') +
  ylab("Predicted Value") + xlab("Prediction Model")

optim() with the mean-absolute-distance model has its own inaccuracies compared to the linear model. However, as the box plots show, its estimates of m and b wrap the expected values (1.5 and 6) much more tightly than the lm() estimates do.

23.3.3 (#1, #3, #4)

  1. Instead of using lm() to fit a straight line, you can use loess() to fit a smooth curve. Repeat the process of model fitting, grid generation, predictions, and visualisation on sim1 using loess() instead of lm(). How does the result compare to geom_smooth()?
# Left panel: lm and loess fits drawn as plain lines via geom_line(stat = "smooth").
gg.geom_line <- ggplot(aes(x=x,y=y), data = sim1a) +
  geom_line(aes(color = "geom_line, smooth, lm"), stat="smooth", method = "lm", alpha = .5) +
  geom_line(aes(color = "geom_line, smooth, loess"), stat="smooth", method = "loess", alpha = .5) +
  geom_point() +
  theme(legend.position = 'none')

# Right panel: the same loess fit via geom_smooth(), which additionally
# draws its confidence ribbon by default.
gg.geom_smooth <- ggplot(aes(x=x,y=y), data = sim1a) +
  geom_smooth(method='loess') +
  geom_point()

suppressMessages(library(gridExtra))
grid.arrange(gg.geom_line,gg.geom_smooth, ncol = 2)

As far as I can tell the visualization of the line done by geom_line() with method = "loess" and stat = "smooth" is identical to geom_smooth() with the exception being that by default geom_smooth() shows a confidence interval around the fitted line, which can be hidden by adding se = FALSE to geom_smooth().

summary(loess(y~x, sim1a))
## Call:
## loess(formula = y ~ x, data = sim1a)
## 
## Number of Observations: 30 
## Equivalent Number of Parameters: 4.19 
## Residual Standard Error: 5.907 
## Trace of smoother matrix: 4.58  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE
# Loess fit per simulated dataset: all overlaid first, then faceted.
sims.gg.loess <- ggplot(data = sims.df, aes(x = x, y = y, group = id, color = id)) +
  geom_point() +
  geom_line(stat = "smooth", method = "loess")
ggplotly(sims.gg.loess)

ggplotly(sims.gg.loess + facet_wrap(~id, ncol = 2, scales = 'free_y'))

For whatever reason my y sim isn’t showing up so here it is.

# Replot simulation 'y' on its own, since it did not render in the
# faceted plot above.
y.gg <- sims.df %>%
  filter(id == 'y') %>%
    ggplot(aes(x = x, y = y, color = id)) +
    geom_point() +
    geom_line(stat = "smooth", method = "loess")

ggplotly(y.gg)

  1. What does geom_ref_line() do? What package does it come from? Why is displaying a reference line in plots showing residuals useful and important?

geom_ref_line() comes from the modelr package. It is used to add a reference line to a ggplot2 plot. It can be useful for putting a line through a value of interest to give a visual reference to how far points are from an expected value. This is especially useful when the x axis is non numeric.

  1. Why might you want to look at a frequency polygon of absolute residuals? What are the pros and cons compared to looking at the raw residuals?

A frequency polygon of the absolute residuals might give you a better idea of how far, on average, your model's predictions lie from the observed values. Given a large enough sample size, the frequency polygon should be most dense near zero if the model fits well. It can also make glaring issues with your model visually obvious: a large frequency far from zero means something is wrong in your model or data that you are probably not accounting for, because the residuals should be especially dense around zero. The downside relative to raw residuals is that taking the absolute value discards the sign, so you can no longer see whether the model systematically over- or under-predicts.

suppressMessages(library(modelr))
# Attach residuals from the straight-line fit, then look at their
# distribution; a good fit should pile the residuals up around zero.
sim1a.res <- sim1a %>% add_residuals(sim1a.model)
ggplot(aes(resid), data = sim1a.res) + geom_freqpoly(binwidth = .5)

In this case there is definitely a datapoint in sim1a that is way off from what is expected by the simple linear model. But for the most part the lm() did a pretty good job with its fit.

23.4.5 (#1, #3, #4)

  1. What happens if you repeat the analysis of sim2 using a model without an intercept. What happens to the model equation? What happens to the predictions?

With intercept
ggplot(sim2) + geom_point(aes(x = x, y = y))

# Default parameterisation: the intercept is the mean of reference group a,
# and each x coefficient is an offset from that baseline.
mod <- lm(y ~ x, data = sim2)

pander(mod)
Fitting linear model: y ~ x
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.152 0.3475 3.316 0.002095
xb 6.964 0.4914 14.17 2.684e-16
xc 4.975 0.4914 10.12 4.469e-12
xd 0.7588 0.4914 1.544 0.1313
# One prediction per level of the categorical x.
grid <- sim2 %>%
  data_grid(x) %>%
  add_predictions(mod)

pander(grid)
x pred
a 1.152
b 8.116
c 6.127
d 1.911
# Overlay the model predictions (red) on the raw data.
ggplot(sim2, aes(x)) +
  geom_point(aes(y = y)) +
  geom_point(data = grid, aes(y = pred), colour = "red", size = 4)

Without intercept
# Drop the intercept (y ~ 0 + x): one coefficient per level of x,
# each estimating that group's mean directly.
mod2 <- lm(y ~ 0 + x, data = sim2)

pander(mod2)
Fitting linear model: y ~ 0 + x
  Estimate Std. Error t value Pr(>|t|)
xa 1.152 0.3475 3.316 0.002095
xb 8.116 0.3475 23.36 2.428e-23
xc 6.127 0.3475 17.63 2.709e-19
xd 1.911 0.3475 5.499 3.245e-06
# Predictions per level are identical to the with-intercept model;
# only the parameterisation changed.
grid2 <- sim2 %>%
  data_grid(x) %>%
  add_predictions(mod2)

pander(grid2)
x pred
a 1.152
b 8.116
c 6.127
d 1.911
# Same overlay as before: identical red prediction points.
ggplot(sim2, aes(x)) +
  geom_point(aes(y = y)) +
  geom_point(data = grid2, aes(y = pred), colour = "red", size = 4)

Without the intercept, the model equation is parameterised on xa, xb, xc and xd instead of an intercept plus offsets x{b,c,d}; each coefficient now directly estimates its group's mean, so the predictions are unchanged. The t values are much larger and the Pr(>|t|) values much smaller, but that is because each test now compares a group mean against zero rather than against the baseline group a — it does not mean the model fits any better.

  1. Using the basic principles, convert the formulas in the following two models into functions. (Hint: start by converting the categorical variable into 0-1 variables.)
mod1 <- lm(y ~ x1 + x2, data = sim3)
# The same additive model expressed as a function of column positions:
# a = response column index, b and c = predictor column indices into `data`.
# The parameter names a/b/c appear in the printed formula, but the fitted
# coefficients match mod1 exactly; only the term labels differ.
mod1.func <- function(a,b,c,data) {
  lm(data[[a]] ~ data[[b]] + data[[c]], data = data)
}
mod1.f <- mod1.func(4,1,2,sim3)

pander(mod1)
Fitting linear model: y ~ x1 + x2
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.872 0.3874 4.832 4.218e-06
x1 -0.1967 0.04871 -4.039 9.718e-05
x2b 2.888 0.3957 7.298 4.066e-11
x2c 4.806 0.3957 12.14 2.42e-22
x2d 2.36 0.3957 5.963 2.787e-08
pander(mod1.f)
Fitting linear model: data[[a]] ~ data[[b]] + data[[c]]
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.872 0.3874 4.832 4.218e-06
data[[b]] -0.1967 0.04871 -4.039 9.718e-05
data[[c]]b 2.888 0.3957 7.298 4.066e-11
data[[c]]c 4.806 0.3957 12.14 2.42e-22
data[[c]]d 2.36 0.3957 5.963 2.787e-08
# Interaction model: x1 * x2 expands to x1 + x2 + x1:x2.
mod2 <- lm(y ~ x1 * x2, data = sim3)
# Column-position version; `*` builds the same main effects and interaction.
mod2.func <- function(a,b,c,data) {
  lm(data[[a]] ~ data[[b]] * data[[c]], data = data)
}
mod2.f <- mod2.func(4,1,2,sim3)

pander(mod2)

pander(mod2)
Fitting linear model: y ~ x1 * x2
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.301 0.404 3.221 0.001673
x1 -0.09302 0.06511 -1.429 0.1559
x2b 7.069 0.5713 12.37 1.092e-22
x2c 4.431 0.5713 7.755 4.407e-12
x2d 0.8346 0.5713 1.461 0.1469
x1:x2b -0.7603 0.09208 -8.257 3.301e-13
x1:x2c 0.06815 0.09208 0.7402 0.4608
x1:x2d 0.2773 0.09208 3.011 0.003216
pander(mod2.f)
Fitting linear model: data[[a]] ~ data[[b]] * data[[c]]
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.301 0.404 3.221 0.001673
data[[b]] -0.09302 0.06511 -1.429 0.1559
data[[c]]b 7.069 0.5713 12.37 1.092e-22
data[[c]]c 4.431 0.5713 7.755 4.407e-12
data[[c]]d 0.8346 0.5713 1.461 0.1469
data[[b]]:data[[c]]b -0.7603 0.09208 -8.257 3.301e-13
data[[b]]:data[[c]]c 0.06815 0.09208 0.7402 0.4608
data[[b]]:data[[c]]d 0.2773 0.09208 3.011 0.003216

  1. For sim4, which of mod1 and mod2 is better? I think mod2 does a slightly better job at removing patterns, but it’s pretty subtle. Can you come up with a plot to support my claim?

# sim4 has two continuous predictors: fit additive vs interaction models.
mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)

pander(mod1)
Fitting linear model: y ~ x1 + x2
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.03546 0.1218 0.291 0.7712
x1 1.822 0.1909 9.543 5.326e-19
x2 -2.783 0.1909 -14.58 1.12e-36
pander(mod2)
Fitting linear model: y ~ x1 * x2
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.03546 0.1199 0.2956 0.7677
x1 1.822 0.1879 9.694 1.773e-19
x2 -2.783 0.1879 -14.81 1.668e-37
x1:x2 0.9523 0.2944 3.235 0.001356
# Evaluate both models on a 10 x 10 grid over the x1/x2 range so their
# prediction surfaces can be compared side by side.
grid <- sim4 %>%
  data_grid(
    x1 = seq_range(x1, 10),
    x2 = seq_range(x2, 10)
  ) %>%
  gather_predictions(mod1, mod2)

# Tile plot of each model's prediction surface, one facet per model.
ggplotly(ggplot(grid, aes(x1, x2)) +
  geom_tile(aes(fill = pred)) +
  facet_wrap(~ model, ncol = 1))
# Raw data for reference: point colour encodes the observed y.
ggplotly(ggplot(aes(x = x1, y = x2),data = sim4) + geom_point(aes(color = y, size = 2)))

It's kind of hard to tell, but mod2 does a better job: its prediction surface is just a bit darker in the upper-left quartile, matching the pattern visible in the geom_point plot above. This is hard to judge purely visually, but thanks to plotly::ggplotly() you can hover to see the higher predictions in that upper quartile in mod2's plot.